library(dplyr)
library(caret)
library(lattice)
library(ggplot2)
library(stats)
library(Hmisc)
library(car)
library(leaps)
library(outliers)
library(kknn)
library(knitr)
library(psych)
library(corrplot)
library(tidyverse)
library(cli)
library(devtools)
library(rpart)
library(pROC)
library(nnet)
library(foreign)
library(reshape2)
library(lattice)
library(ggplot2)
library(comprehenr)
library(gridExtra)
library(MASS)
library(aod)
library(e1071)
library(factoextra)
#conda install -c conda-forge r-factoextra
library(class)
library(randomForest)
library(rpart)
library(rpart.plot)
library(gbm)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Loading required package: lattice
Loading required package: ggplot2
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:caret’:
cluster
Loading required package: Formula
Attaching package: ‘Hmisc’
The following objects are masked from ‘package:dplyr’:
src, summarize
The following objects are masked from ‘package:base’:
format.pval, units
Loading required package: carData
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
Attaching package: ‘kknn’
The following object is masked from ‘package:caret’:
contr.dummy
Attaching package: ‘psych’
The following object is masked from ‘package:outliers’:
outlier
The following object is masked from ‘package:car’:
logit
The following object is masked from ‘package:Hmisc’:
describe
The following objects are masked from ‘package:ggplot2’:
%+%, alpha
corrplot 0.84 loaded
Registered S3 method overwritten by 'rvest':
method from
read_xml.response xml2
── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
✔ tibble 2.1.1 ✔ purrr 0.3.2
✔ tidyr 0.8.3 ✔ stringr 1.4.0
✔ readr 1.3.1 ✔ forcats 0.4.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ psych::%+%() masks ggplot2::%+%()
✖ psych::alpha() masks ggplot2::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ purrr::lift() masks caret::lift()
✖ car::recode() masks dplyr::recode()
✖ purrr::some() masks car::some()
✖ Hmisc::src() masks dplyr::src()
✖ Hmisc::summarize() masks dplyr::summarize()
Type 'citation("pROC")' for a citation.
Attaching package: ‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
Attaching package: ‘reshape2’
The following object is masked from ‘package:tidyr’:
smiths
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Attaching package: ‘aod’
The following object is masked from ‘package:survival’:
rats
Attaching package: ‘e1071’
The following object is masked from ‘package:Hmisc’:
impute
Warning message:
“package ‘factoextra’ was built under R version 3.6.3”Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:gridExtra’:
combine
The following object is masked from ‘package:psych’:
outlier
The following object is masked from ‘package:outliers’:
outlier
The following object is masked from ‘package:ggplot2’:
margin
The following object is masked from ‘package:dplyr’:
combine
Loaded gbm 2.1.5
#setwd("~/Downloads/Datasets/7406/Project")
white <- read.csv(file = "winequality_white.csv", head = TRUE, sep=";")
red <- read.csv(file = "winequality_red.csv", head = TRUE, sep=";")
wine <- red
summary(wine)
head(wine)
str(wine)
fixed.acidity volatile.acidity citric.acid residual.sugar
Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
chlorides free.sulfur.dioxide total.sulfur.dioxide density
Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
pH sulphates alcohol quality
Min. :2.740 Min. :0.3300 Min. : 8.40 Min. :3.000
1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
Median :3.310 Median :0.6200 Median :10.20 Median :6.000
Mean :3.311 Mean :0.6581 Mean :10.42 Mean :5.636
3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
Max. :4.010 Max. :2.0000 Max. :14.90 Max. :8.000
| fixed.acidity | volatile.acidity | citric.acid | residual.sugar | chlorides | free.sulfur.dioxide | total.sulfur.dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25 | 67 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 |
| 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15 | 54 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 |
| 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17 | 60 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 |
| 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 7.4 | 0.66 | 0.00 | 1.8 | 0.075 | 13 | 40 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
'data.frame': 1599 obs. of 12 variables: $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ... $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ... $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ... $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ... $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ... $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ... $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ... $ density : num 0.998 0.997 0.997 0.998 0.998 ... $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ... $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ... $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ... $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
nobs <- nrow(wine)
print(paste('White: ',dim(white)))
print(paste('Red: ', dim(red)))
na_count <- sapply(wine, function(y) sum(length(which(is.na(y)))))
na_count
[1] "White: 4898" "White: 12" [1] "Red: 1599" "Red: 12"
par(mfrow = c(4,3))
hist(wine$fixed.acidity)
hist(wine$volatile.acidity)
hist(wine$citric.acid)
hist(wine$residual.sugar)
hist(wine$chlorides)
hist(wine$free.sulfur.dioxide)
hist(wine$total.sulfur.dioxide)
hist(wine$density)
hist(wine$pH)
hist(wine$sulphates)
hist(wine$alcohol)
hist(wine$quality)
par(mfrow = c(2,2))
qqnorm(wine$quality, pch = 1, frame = FALSE, main = 'QQ-plot - QUALITY')
qqline(wine$quality, col = "blue", lwd = 2)
boxplot(wine$quality, main = 'Original data - Outliers')
grubbs.test(wine$quality, type =10)
Grubbs test for one outlier data: wine$quality G = 3.26414, U = 0.99333, p-value = 0.8626 alternative hypothesis: lowest value 3 is an outlier
Our dataset does not have any missing values.
Our response variable "Quality" is a categorical one with the ranks from 1 to 10. The range of values (min/max) across the variables does not require scaling or normalization. All predictors are numerical.
corrplot(cor(wine[1:(length(wine))]), method="number")
Covariance Matrix
There is some significant multicollineraity between several variables:
density - alcohol total.sulfur.dioxide - free.sulfur.dioxide density - residual.sugar residual.sugar - total.sulfur.dioxide density - fixed.acidity
Certain predictors will have to be removed. Most likely candidates are:
free.sulfur.dioxide residual.sugar fixed.acidity alcohol
par(mfrow = c(2,2))
hist(wine$quality,xlab = 'Quality', main = "Distribution of Red Wine Quality")
#hist(wine[wine$type == "white",]$quality,xlab = 'Quality', main = "Distribution of White Wine Quality")
#hist(wine[wine$type == "red",]$quality,xlab = 'Quality', main = "Distribution of Red Wine Quality")
attach(wine)
par(mfrow = c(3,2))
plot(density, fixed.acidity)
plot(volatile.acidity, total.sulfur.dioxide)
plot(residual.sugar, density)
plot(residual.sugar, total.sulfur.dioxide)
plot(free.sulfur.dioxide, total.sulfur.dioxide)
plot(density, alcohol)
par(mfrow = c(3,3))
boxplot(fixed.acidity ~ quality)
boxplot(volatile.acidity ~ quality)
boxplot(citric.acid ~ quality)
boxplot(residual.sugar ~ quality)
boxplot(chlorides ~ quality)
boxplot(free.sulfur.dioxide ~ quality)
boxplot(total.sulfur.dioxide ~ quality)
boxplot(density ~ quality)
boxplot(pH ~ quality)
boxplot(sulphates ~ quality)
boxplot(alcohol ~ quality)
Data Preparation
#wine <- rbind(white, red)
wine.pr <- wine[1:11]
wine$quality <- ifelse(wine$quality>= 6, 1,0)
wine$quality <- as.factor(wine$quality)
summary(wine)
#Random sampling
set.seed(7406)
test <- 0.8
ff <- floor((1 - test)*nrow(wine))
ind <- sample(nrow(wine), ff, replace = FALSE)
# Training and Test datasets
wine_train <- wine[ind,]
wine_test <- wine[-ind,]
fixed.acidity volatile.acidity citric.acid residual.sugar
Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
chlorides free.sulfur.dioxide total.sulfur.dioxide density
Min. :0.01200 Min. : 1.00 Min. : 6.00 Min. :0.9901
1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956
Median :0.07900 Median :14.00 Median : 38.00 Median :0.9968
Mean :0.08747 Mean :15.87 Mean : 46.47 Mean :0.9967
3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978
Max. :0.61100 Max. :72.00 Max. :289.00 Max. :1.0037
pH sulphates alcohol quality
Min. :2.740 Min. :0.3300 Min. : 8.40 0:744
1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50 1:855
Median :3.310 Median :0.6200 Median :10.20
Mean :3.311 Mean :0.6581 Mean :10.42
3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
Max. :4.010 Max. :2.0000 Max. :14.90
MODELING
We will start with converting scale values for "quality" from 1-10 to a binary system (0,1). A 1 will represent good wine and a 0 will represent "bad" wine. We will analyze initial dataset using ordinal logistic regression at the end of this research.
At first, we will use logistic regression, KNN, SVM, Trees etc..
For Logistic regression, we romoved the following non-significant predictors:
density fixed.acidity residual.sugar
This is well-aligned with the conclusions from the covariance matrix
lgall <- glm(quality ~ ., data = wine, family = binomial(link="logit"))
summary(lgall)
Call:
glm(formula = quality ~ ., family = binomial(link = "logit"),
data = wine)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4025 -0.8387 0.3105 0.8300 2.3142
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 42.949948 79.473979 0.540 0.58890
fixed.acidity 0.135980 0.098483 1.381 0.16736
volatile.acidity -3.281694 0.488214 -6.722 1.79e-11 ***
citric.acid -1.274347 0.562730 -2.265 0.02354 *
residual.sugar 0.055326 0.053770 1.029 0.30351
chlorides -3.915713 1.569298 -2.495 0.01259 *
free.sulfur.dioxide 0.022220 0.008236 2.698 0.00698 **
total.sulfur.dioxide -0.016394 0.002882 -5.688 1.29e-08 ***
density -50.932385 81.148745 -0.628 0.53024
pH -0.380608 0.720203 -0.528 0.59717
sulphates 2.795107 0.452184 6.181 6.36e-10 ***
alcohol 0.866822 0.104190 8.320 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2209.0 on 1598 degrees of freedom
Residual deviance: 1655.6 on 1587 degrees of freedom
AIC: 1679.6
Number of Fisher Scoring iterations: 4
wald.test(Sigma = vcov(lgall), b = coef(lgall), Terms = 1:length(lgall$coef))
Wald test: ---------- Chi-squared test: X2 = 356.5, df = 12, P(> X2) = 0.0
Since p-value = 0.0, then there are no non-significant coefficients.
aiclgall <- round(AIC(lgall),1)
biclgall <- round(BIC(lgall),1)
print(paste('AIC = ', aiclgall, ' BIC = ', biclgall))
[1] "AIC = 1679.6 BIC = 1744.2"
print('Significance Ranking')
round(vif(lgall),3)
[1] "Significance Ranking"
# Confidence Intervals
exp(cbind(OR = coef(lgall), confint(lgall)))
Waiting for profiling to be done...
| OR | 2.5 % | 97.5 % | |
|---|---|---|---|
| (Intercept) | 4.497027e+18 | 9.875179e-50 | 2.611259e+86 |
| fixed.acidity | 1.145659e+00 | 9.453487e-01 | 1.391245e+00 |
| volatile.acidity | 3.756458e-02 | 1.417753e-02 | 9.623115e-02 |
| citric.acid | 2.796134e-01 | 9.208797e-02 | 8.372148e-01 |
| residual.sugar | 1.056885e+00 | 9.505294e-01 | 1.174768e+00 |
| chlorides | 1.992634e-02 | 8.561776e-04 | 4.097313e-01 |
| free.sulfur.dioxide | 1.022469e+00 | 1.006133e+00 | 1.039179e+00 |
| total.sulfur.dioxide | 9.837397e-01 | 9.780924e-01 | 9.892261e-01 |
| density | 7.591825e-23 | 4.731209e-92 | 8.931309e+46 |
| pH | 6.834461e-01 | 1.670776e-01 | 2.818561e+00 |
| sulphates | 1.636437e+01 | 6.881270e+00 | 4.054391e+01 |
| alcohol | 2.379338e+00 | 1.943967e+00 | 2.925369e+00 |
print('Error')
prediction <- predict(lgall,wine[,1:11], type = "response")
prediction <- ifelse(prediction>= .5, 1,0)
[1] "Error"
cm <- confusionMatrix(wine$quality, as.factor(prediction))
print(cm)
print(paste('Accuracy = ', round(cm$overall['Accuracy'],3) ))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 549 195
1 214 641
Accuracy : 0.7442
95% CI : (0.7221, 0.7654)
No Information Rate : 0.5228
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4868
Mcnemar's Test P-Value : 0.3734
Sensitivity : 0.7195
Specificity : 0.7667
Pos Pred Value : 0.7379
Neg Pred Value : 0.7497
Prevalence : 0.4772
Detection Rate : 0.3433
Detection Prevalence : 0.4653
Balanced Accuracy : 0.7431
'Positive' Class : 0
[1] "Accuracy = 0.744"
lgsig <- glm(quality ~ .- residual.sugar - fixed.acidity - density - citric.acid - pH,
data = wine, family = binomial(link="logit"))
summary(lgsig)
Call:
glm(formula = quality ~ . - residual.sugar - fixed.acidity -
density - citric.acid - pH, family = binomial(link = "logit"),
data = wine)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.0827 -0.8539 0.3241 0.8369 2.3080
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.149844 0.809006 -10.074 < 2e-16 ***
volatile.acidity -2.895939 0.370675 -7.813 5.60e-15 ***
chlorides -4.421027 1.432052 -3.087 0.00202 **
free.sulfur.dioxide 0.023862 0.007987 2.988 0.00281 **
total.sulfur.dioxide -0.017505 0.002707 -6.468 9.95e-11 ***
sulphates 2.705876 0.427543 6.329 2.47e-10 ***
alcohol 0.859379 0.070725 12.151 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2209.0 on 1598 degrees of freedom
Residual deviance: 1664.7 on 1592 degrees of freedom
AIC: 1678.7
Number of Fisher Scoring iterations: 4
wald.test(Sigma = vcov(lgsig), b = coef(lgsig), Terms = 1:length(lgsig$coef))
Wald test: ---------- Chi-squared test: X2 = 353.6, df = 7, P(> X2) = 0.0
aiclgsig <- round(AIC(lgsig),1)
biclgsig <- round(BIC(lgsig),1)
print(paste('AIC = ', aiclgsig, ' BIC = ', biclgsig))
[1] "AIC = 1678.7 BIC = 1716.4"
print('Significance Ranking')
round(vif(lgsig),3)
[1] "Significance Ranking"
exp(cbind(OR = coef(lgsig), confint(lgsig)))
Waiting for profiling to be done...
| OR | 2.5 % | 97.5 % | |
|---|---|---|---|
| (Intercept) | 2.887804e-04 | 5.790494e-05 | 0.001382764 |
| volatile.acidity | 5.524713e-02 | 2.647643e-02 | 0.113314431 |
| chlorides | 1.202188e-02 | 6.719787e-04 | 0.188811030 |
| free.sulfur.dioxide | 1.024149e+00 | 1.008293e+00 | 1.040386786 |
| total.sulfur.dioxide | 9.826471e-01 | 9.773245e-01 | 0.987765461 |
| sulphates | 1.496742e+01 | 6.599082e+00 | 35.310209138 |
| alcohol | 2.361693e+00 | 2.060864e+00 | 2.719686654 |
print('Error')
predictions <- predict(lgsig,wine[,1:11], type = "response")
predictions <- ifelse(predictions >= .5, 1,0)
cm <- confusionMatrix(wine$quality, as.factor(predictions))
print(cm)
print(paste('Accuracy = ', round(cm$overall['Accuracy'],3) ))
[1] "Error"
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 559 185
1 215 640
Accuracy : 0.7498
95% CI : (0.7279, 0.7709)
No Information Rate : 0.5159
P-Value [Acc > NIR] : <2e-16
Kappa : 0.4986
Mcnemar's Test P-Value : 0.1471
Sensitivity : 0.7222
Specificity : 0.7758
Pos Pred Value : 0.7513
Neg Pred Value : 0.7485
Prevalence : 0.4841
Detection Rate : 0.3496
Detection Prevalence : 0.4653
Balanced Accuracy : 0.7490
'Positive' Class : 0
[1] "Accuracy = 0.75"
res = resid(lgsig,type="deviance")
par(mfrow=c(2,2))
qqnorm(res, ylab="Std residuals")
hist(res,10,xlab="Std residuals", main="")
#wine.pr$quality <- as.numeric(wine.pr$quality)
num <- 15
nst <- 6 #Initial number of centroids
res_fin <- matrix(0, num, 1)
for (k in 1:num) {
kmd <- kmeans(wine.pr, k, nstart = nst)
kvl <- kmd$tot.withinss
res_fin[k] <- kvl
}
plot(res_fin, type = 'l', ylab = 'WSS', xlab = 'K',
main = 'Optimal K value for KMEANS')
Warning message: “did not converge in 10 iterations”Warning message: “did not converge in 10 iterations”Warning message: “did not converge in 10 iterations”Warning message: “did not converge in 10 iterations”Warning message: “did not converge in 10 iterations”Warning message: “did not converge in 10 iterations”
# WSS Method
kbest <- 5 # The best K Value
fitK = kmeans(wine.pr, kbest, nstart = 5)
plot(wine,col = fitK$cluster)
ggplot(wine, aes(x = free.sulfur.dioxide, y = volatile.acidity, shape = quality, col = quality))+geom_point()
ggplot(wine, aes(x = free.sulfur.dioxide, y = alcohol, shape = quality, col = quality))+geom_point()
ggplot(wine, aes(x = free.sulfur.dioxide, y = sulphates, shape = quality, col = quality))+geom_point()
ggplot(wine, aes(x = chlorides, y = sulphates, shape = quality, col = quality))+geom_point()
ggplot(wine, aes(x = chlorides, y = alcohol, shape = quality, col = quality))+geom_point()
ggplot(wine, aes(x = sulphates, y = alcohol, shape = quality, col = quality)) + geom_point()
ggplot(wine, aes(x = free.sulfur.dioxide, y = chlorides, shape = quality,
col = quality))+geom_point()
ggplot(wine, aes(x = volatile.acidity, y = chlorides, shape = quality, col = quality))+geom_point()
ggplot(wine, aes(x = volatile.acidity, y = sulphates, shape = quality, col = quality))+geom_point()
ggplot(wine, aes(x = volatile.acidity, y = alcohol, shape = quality, col = quality))+geom_point()
# The data is not separable. Thus KNN is preferred choice over SVM
fitK
K-means clustering with 5 clusters of sizes 446, 598, 292, 193, 70
Cluster means:
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
1 8.232960 0.5264126 0.2422197 2.397085 0.08646861
2 8.621070 0.5129599 0.2875753 2.425251 0.08402174
3 8.229795 0.5203767 0.2750685 2.405993 0.09495205
4 7.817098 0.5772021 0.2607254 3.090155 0.08837824
5 8.057143 0.5586429 0.3235714 3.445714 0.08951429
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
1 15.422646 38.91704 0.9966850 3.328924 0.6471749
2 7.478261 18.48997 0.9966374 3.299682 0.6537960
3 25.020548 61.59932 0.9968999 3.324247 0.6857534
4 23.227979 94.23316 0.9968525 3.313316 0.6392228
5 32.064286 138.77143 0.9971420 3.234429 0.7022857
alcohol
1 10.444283
2 10.642252
3 10.299600
4 10.110104
5 9.791429
Clustering vector:
[1] 1 3 3 3 1 1 3 2 2 4 3 4 3 2 5 5 4 3 2 3 3 3 1 3 1 2 2 1 1 2 4 1 4 4 1 2 2
[38] 1 2 4 4 1 2 2 2 3 4 1 2 4 2 2 2 5 3 1 2 4 3 1 3 4 1 3 2 2 1 2 3 2 2 4 4 1
[75] 4 1 1 1 4 5 2 3 3 1 3 1 5 1 5 2 5 5 5 1 4 4 2 2 2 1 1 2 1 3 1 3 3 3 4 5 1
[112] 4 4 1 1 3 1 2 3 4 4 3 2 1 4 4 2 2 2 2 5 4 4 2 1 3 1 1 4 4 3 1 3 2 3 5 4 4
[149] 1 1 2 3 4 4 5 5 5 5 2 4 2 2 1 5 5 4 4 1 1 3 2 2 2 1 1 1 1 1 2 1 1 4 1 3 3
[186] 4 3 2 5 5 5 1 5 2 2 5 3 2 4 1 2 5 1 1 1 2 2 5 4 2 2 3 2 3 1 5 1 1 1 5 3 3
[223] 2 2 3 3 4 2 3 1 4 1 3 1 1 1 1 1 1 1 3 2 4 2 2 2 1 3 1 2 1 1 2 4 1 5 2 3 1
[260] 1 1 3 1 1 2 2 3 1 2 2 3 2 3 1 4 3 2 2 2 1 2 2 1 1 3 3 3 3 1 4 1 2 1 2 3 2
[297] 3 2 2 1 1 2 2 2 4 2 1 2 2 1 2 4 4 5 3 3 4 3 3 3 3 4 2 3 1 1 2 2 2 2 1 1 4
[334] 1 1 2 2 3 4 1 2 2 2 2 1 3 1 2 1 2 1 2 2 3 5 1 3 1 2 1 4 3 1 2 1 2 1 2 3 2
[371] 3 2 1 3 1 2 1 2 2 3 2 1 2 2 3 1 1 1 3 1 4 1 2 4 2 1 5 1 1 1 5 1 3 1 2 2 2
[408] 1 2 3 4 4 3 2 5 5 2 5 2 3 3 3 3 2 3 3 1 1 1 1 2 3 2 2 3 2 3 2 3 1 2 2 1 2
[445] 2 1 3 2 2 1 1 2 1 2 3 2 2 3 2 1 2 2 2 5 2 2 3 2 2 3 1 3 3 2 2 2 2 2 2 1 2
[482] 2 2 2 2 2 2 2 3 2 3 2 2 4 4 2 1 4 2 4 1 3 3 2 2 2 2 2 3 1 1 3 1 2 2 5 2 2
[519] 1 4 1 1 5 5 4 3 4 4 3 1 2 1 1 1 1 2 1 2 1 2 1 2 2 3 2 4 2 2 3 2 2 1 1 4 2
[556] 2 2 2 2 1 2 5 4 3 1 2 2 2 2 1 1 1 2 1 4 1 2 4 4 1 2 2 2 2 4 1 1 4 1 2 3 5
[593] 3 1 1 4 2 2 2 2 2 1 2 1 3 2 2 3 3 1 3 2 1 2 4 3 3 1 2 2 4 4 1 1 3 3 2 2 1
[630] 4 1 2 1 4 4 1 5 5 2 3 1 3 1 3 1 2 2 2 2 5 1 5 3 1 2 3 1 1 2 2 2 1 2 2 3 1
[667] 2 2 1 2 1 2 5 2 2 1 2 1 4 1 1 1 3 2 5 2 1 1 2 2 2 4 3 5 5 1 2 2 3 2 4 2 2
[704] 4 2 1 2 1 1 1 4 4 1 1 3 1 1 1 2 1 2 4 1 5 2 2 1 2 2 3 1 2 2 3 1 2 2 3 4 1
[741] 2 5 2 4 4 2 3 3 1 2 1 1 3 1 1 1 1 1 1 4 4 1 2 1 1 1 3 4 4 1 4 5 5 1 1 2 1
[778] 2 1 4 1 1 4 1 4 3 3 3 3 5 4 5 4 2 2 3 3 1 2 2 4 2 4 1 2 2 2 2 2 2 2 2 1 2
[815] 1 1 2 1 1 3 2 1 1 1 2 1 2 1 3 2 2 2 3 3 2 3 4 4 2 2 1 1 3 4 1 1 1 1 1 1 2
[852] 2 5 3 3 2 3 1 1 2 4 4 2 4 4 4 2 2 2 1 1 3 3 2 2 2 2 1 3 4 2 1 2 4 3 1 2 3
[889] 3 4 3 4 2 4 4 1 1 1 1 2 1 1 1 2 2 4 4 2 3 2 2 2 2 2 2 2 3 1 4 3 2 4 3 1 1
[926] 3 3 3 1 2 2 1 3 1 2 3 3 2 1 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 2
[963] 2 1 1 1 2 4 1 2 2 2 2 2 2 3 3 5 2 2 2 2 3 2 2 2 2 3 1 2 1 3 3 3 4 1 2 2 2
[1000] 2 2 3 2 2 1 2 2 2 2 3 2 2 1 1 2 2 2 4 4 2 2 2 2 2 2 1 1 3 3 2 2 2 2 2 1 2
[1037] 1 3 1 1 1 2 1 1 1 1 3 3 2 2 3 1 2 2 4 4 2 4 3 2 2 2 2 2 2 2 2 2 2 3 1 4 3
[1074] 1 4 3 2 2 2 5 2 5 3 3 3 3 2 1 1 1 3 2 2 1 2 2 2 2 2 2 1 2 1 2 2 1 2 2 2 3
[1111] 2 3 1 2 4 2 2 2 2 2 2 1 1 1 2 2 2 3 4 3 2 5 1 2 2 3 2 2 4 4 4 3 2 2 3 3 2
[1148] 2 2 2 2 3 1 1 3 1 4 4 3 1 2 2 2 2 2 1 3 3 1 2 2 1 2 3 3 3 1 2 4 1 1 2 3 3
[1185] 4 1 2 1 4 2 2 2 1 2 3 3 4 2 3 4 2 2 2 3 1 1 1 3 1 2 2 3 2 2 2 1 4 3 2 3 1
[1222] 1 4 2 2 3 1 2 4 3 3 4 3 1 2 4 2 2 2 2 3 3 1 4 5 1 1 1 1 1 1 3 2 2 1 1 3 3
[1259] 1 1 3 2 3 2 3 1 1 2 3 4 3 3 1 1 1 4 2 2 4 2 3 3 2 3 1 1 1 2 4 4 2 3 1 2 3
[1296] 3 3 1 2 2 1 3 2 3 3 4 4 1 4 3 4 2 1 3 3 4 1 1 4 3 4 1 2 3 1 1 1 1 1 4 4 3
[1333] 2 1 2 1 2 2 2 1 1 1 1 1 2 2 2 2 2 1 3 1 3 3 2 1 2 3 4 2 1 3 2 3 1 1 1 4 3
[1370] 2 3 2 3 4 1 4 1 3 2 1 1 2 4 4 4 4 2 2 1 4 1 1 2 3 3 2 2 4 2 2 5 5 2 2 1 1
[1407] 2 3 3 3 2 2 2 4 2 2 2 1 2 5 2 3 3 1 2 2 3 2 3 3 1 3 2 2 4 4 4 1 2 4 1 4 1
[1444] 2 4 4 1 1 3 1 1 2 3 4 2 2 4 4 2 2 3 2 1 1 3 3 3 1 3 1 2 2 1 1 4 4 4 4 2 2
[1481] 2 2 2 2 2 2 2 2 2 2 2 2 2 5 1 1 5 2 2 2 2 3 3 2 2 2 2 2 2 2 3 1 2 3 3 3 3
[1518] 1 2 2 1 2 3 3 1 1 1 2 3 3 2 2 2 4 1 1 1 1 2 3 1 1 1 2 2 1 2 2 1 2 2 1 3 1
[1555] 2 2 2 2 5 5 5 5 2 2 2 1 3 2 1 1 1 2 4 1 4 2 1 2 2 1 2 1 2 4 1 1 3 1 4 4 1
[1592] 2 1 1 1 3 1 1 1
Within cluster sum of squares by cluster:
[1] 37160.37 31518.58 45417.05 43785.88 63297.60
(between_SS / total_SS = 88.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault"
library(factoextra)
fviz_cluster(fitK, data = wine.pr)
i=1
k.optm=1
for (i in 1:28){
knn.mod <- knn(train=wine_train, test=wine_test, cl=wine_train$quality, k=i)
k.optm[i] <- 100 * sum(wine_test$quality == knn.mod)/NROW(wine_test$quality)
k=i
cat(k,'=',k.optm[i],'
')}
plot(k.optm, type="b", xlab="K- Value",ylab="Accuracy level")
1 = 71.09375 2 = 66.875 3 = 68.04688 4 = 66.95312 5 = 67.42188 6 = 65.78125 7 = 65.54688 8 = 66.01562 9 = 67.26562 10 = 66.09375 11 = 66.32812 12 = 65.78125 13 = 65.9375 14 = 66.5625 15 = 65.46875 16 = 65.9375 17 = 65.07812 18 = 65.3125 19 = 64.84375 20 = 64.92188 21 = 63.90625 22 = 63.82812 23 = 63.28125 24 = 63.67188 25 = 63.04688 26 = 62.5 27 = 62.42188 28 = 62.65625
kbest <- 3
knn <- knn(train=wine_train, test=wine_test, cl=wine_train$quality, k= kbest)
cm <- confusionMatrix(wine_test$quality, as.factor(knn))
print(cm)
print(paste('Accuracy = ', round(cm$overall['Accuracy'],3) ))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 374 236
1 175 495
Accuracy : 0.6789
95% CI : (0.6525, 0.7044)
No Information Rate : 0.5711
P-Value [Acc > NIR] : 1.643e-15
Kappa : 0.3535
Mcnemar's Test P-Value : 0.003081
Sensitivity : 0.6812
Specificity : 0.6772
Pos Pred Value : 0.6131
Neg Pred Value : 0.7388
Prevalence : 0.4289
Detection Rate : 0.2922
Detection Prevalence : 0.4766
Balanced Accuracy : 0.6792
'Positive' Class : 0
[1] "Accuracy = 0.679"
# Linear Classifier
# To get graphical outputs
results_c <- matrix(rep(0, len=14), nrow = 2)
c_value = c(1, 2, 3, 5, 10, 20, 50) # chosen C values to select from
cl = length(c_value)
for (j in 1:cl) {
cj <- c_value[j]
results_c[1,j] <- cj
msvm <- svm(formula = quality ~ .,
data = wine_train,
type = 'C-classification',
kernel = 'linear',
cost = cj,
scaled = TRUE)
print(msvm) # display the model
# Model Prediction and Accuracy
pred <- predict(msvm,wine_test[,1:11])
acc = sum(pred == wine_test$quality) / nobs
results_c[2,j] <- acc
}
plot(results_c[1,], results_c[2,],main="Accuracy for C Values, Linear",
ylab = "Accuracy, %", xlab = "C", type = "l", col = "red")
c_best <- which.max(results_c[2,])
print(paste(' Best C = ', c_best, ' Accuracy = ', round(results_c[2,c_best],3)))
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "linear", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 1
gamma: 0.09090909
Number of Support Vectors: 174
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "linear", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 2
gamma: 0.09090909
Number of Support Vectors: 174
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "linear", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 3
gamma: 0.09090909
Number of Support Vectors: 175
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "linear", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 5
gamma: 0.09090909
Number of Support Vectors: 173
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "linear", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 10
gamma: 0.09090909
Number of Support Vectors: 174
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "linear", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 20
gamma: 0.09090909
Number of Support Vectors: 174
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "linear", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: linear
cost: 50
gamma: 0.09090909
Number of Support Vectors: 173
[1] " Best C = 1 Accuracy = 0.593"
# To get graphical outputs
results_c <- matrix(rep(0, len=14), nrow = 2)
c_value = c(1, 2, 3, 5, 10, 20, 50) # chosen C values to select from
cl = length(c_value)
for (j in 1:cl) {
cj <- c_value[j]
results_c[1,j] <- cj
msvm <- svm(formula = quality ~ .,
data = wine_train,
type = 'C-classification',
kernel = 'radial',
cost = cj,
scaled = TRUE)
print(msvm) # display the model
# Model Prediction and Accuracy
pred <- predict(msvm,wine_test[,1:11])
prediction = sum(pred == wine_test$quality) / nobs
results_c[2,j] <- prediction
}
plot(results_c[1,], results_c[2,],main="Accuracy for C Values, Radial",
ylab = "Accuracy, %", xlab = "C", type = "l", col = "red")
c_best <- which.max(results_c[2,])
print(paste(' Best C = ', c_best, ' Accuracy = ', round(results_c[2,c_best],3)))
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "radial", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 1
gamma: 0.09090909
Number of Support Vectors: 219
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "radial", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 2
gamma: 0.09090909
Number of Support Vectors: 211
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "radial", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 3
gamma: 0.09090909
Number of Support Vectors: 209
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "radial", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 5
gamma: 0.09090909
Number of Support Vectors: 208
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "radial", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 10
gamma: 0.09090909
Number of Support Vectors: 208
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "radial", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 20
gamma: 0.09090909
Number of Support Vectors: 207
Call:
svm(formula = quality ~ ., data = wine_train, type = "C-classification",
kernel = "radial", cost = cj, scaled = TRUE)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 50
gamma: 0.09090909
Number of Support Vectors: 202
[1] " Best C = 3 Accuracy = 0.601"
ord_mod <- polr(as.factor(quality) ~., data = wine.pr, Hess = TRUE)
summary(ord_mod)
Call:
polr(formula = as.factor(quality) ~ ., data = wine.pr, Hess = TRUE)
Coefficients:
Value Std. Error t value
fixed.acidity 0.10240 0.051209 2.000
volatile.acidity -3.41794 0.400103 -8.543
citric.acid -0.80494 0.462371 -1.741
residual.sugar 0.07617 0.038210 1.993
chlorides -5.17121 1.354373 -3.818
free.sulfur.dioxide 0.01392 0.006767 2.057
total.sulfur.dioxide -0.01119 0.002360 -4.744
density -48.92546 0.974488 -50.206
pH -0.98472 0.496900 -1.982
sulphates 2.86724 0.358016 8.009
alcohol 0.85611 0.059355 14.424
Intercepts:
Value Std. Error t value
3|4 -48.8787 0.9979 -48.9797
4|5 -46.9597 0.9959 -47.1541
5|6 -43.2452 0.9988 -43.2968
6|7 -40.3898 1.0111 -39.9454
7|8 -37.3837 1.0409 -35.9138
Residual Deviance: 3074.928
AIC: 3106.928
(ctable <- coef(summary(ord_mod)))
| Value | Std. Error | t value | |
|---|---|---|---|
| fixed.acidity | 0.10239667 | 0.051208703 | 1.999595 |
| volatile.acidity | -3.41794232 | 0.400103057 | -8.542655 |
| citric.acid | -0.80493954 | 0.462371339 | -1.740894 |
| residual.sugar | 0.07616957 | 0.038209940 | 1.993449 |
| chlorides | -5.17121245 | 1.354373478 | -3.818158 |
| free.sulfur.dioxide | 0.01392022 | 0.006767187 | 2.057016 |
| total.sulfur.dioxide | -0.01119452 | 0.002359517 | -4.744413 |
| density | -48.92545743 | 0.974488188 | -50.206311 |
| pH | -0.98471618 | 0.496900166 | -1.981718 |
| sulphates | 2.86723651 | 0.358015972 | 8.008683 |
| alcohol | 0.85611493 | 0.059354809 | 14.423683 |
| 3|4 | -48.87868292 | 0.997938359 | -48.979661 |
| 4|5 | -46.95972294 | 0.995878057 | -47.154089 |
| 5|6 | -43.24521407 | 0.998809085 | -43.296777 |
| 6|7 | -40.38977373 | 1.011124776 | -39.945390 |
| 7|8 | -37.38369205 | 1.040928295 | -35.913801 |
## calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
## combined table
(ctable <- cbind(ctable, "p value" = p))
| Value | Std. Error | t value | p value | |
|---|---|---|---|---|
| fixed.acidity | 0.10239667 | 0.051208703 | 1.999595 | 4.554400e-02 |
| volatile.acidity | -3.41794232 | 0.400103057 | -8.542655 | 1.311728e-17 |
| citric.acid | -0.80493954 | 0.462371339 | -1.740894 | 8.170214e-02 |
| residual.sugar | 0.07616957 | 0.038209940 | 1.993449 | 4.621229e-02 |
| chlorides | -5.17121245 | 1.354373478 | -3.818158 | 1.344516e-04 |
| free.sulfur.dioxide | 0.01392022 | 0.006767187 | 2.057016 | 3.968465e-02 |
| total.sulfur.dioxide | -0.01119452 | 0.002359517 | -4.744413 | 2.091120e-06 |
| density | -48.92545743 | 0.974488188 | -50.206311 | 0.000000e+00 |
| pH | -0.98471618 | 0.496900166 | -1.981718 | 4.751077e-02 |
| sulphates | 2.86723651 | 0.358015972 | 8.008683 | 1.159431e-15 |
| alcohol | 0.85611493 | 0.059354809 | 14.423683 | 3.672145e-47 |
| 3|4 | -48.87868292 | 0.997938359 | -48.979661 | 0.000000e+00 |
| 4|5 | -46.95972294 | 0.995878057 | -47.154089 | 0.000000e+00 |
| 5|6 | -43.24521407 | 0.998809085 | -43.296777 | 0.000000e+00 |
| 6|7 | -40.38977373 | 1.011124776 | -39.945390 | 0.000000e+00 |
| 7|8 | -37.38369205 | 1.040928295 | -35.913801 | 1.860313e-282 |
#(ci <- confint(ord_mod)) # default method gives profiled CIs
confint.default(ord_mod) # CIs assuming normality
# If the move from 2.5% to 97.% includes a zero (a sign change) then this coefficient is NOT statistically significant
| 2.5 % | 97.5 % | |
|---|---|---|
| fixed.acidity | 2.029459e-03 | 0.202763888 |
| volatile.acidity | -4.202130e+00 | -2.633754738 |
| citric.acid | -1.711171e+00 | 0.101291629 |
| residual.sugar | 1.279467e-03 | 0.151059678 |
| chlorides | -7.825736e+00 | -2.516689216 |
| free.sulfur.dioxide | 6.567716e-04 | 0.027183659 |
| total.sulfur.dioxide | -1.581909e-02 | -0.006569955 |
| density | -5.083542e+01 | -47.015495682 |
| pH | -1.958623e+00 | -0.010809751 |
| sulphates | 2.165538e+00 | 3.568934919 |
| alcohol | 7.397816e-01 | 0.972448215 |
## odds ratios - These coefficients are called proportional odds ratios
#exp(cbind(OR = coef(ord_mod), ci))
# Let's rerun the regression with only significant predictors
ord_mods <- polr(as.factor(quality) ~. - residual.sugar - fixed.acidity - density - citric.acid - pH,
data = wine.pr, Hess = TRUE)
summary(ord_mods)
Call:
polr(formula = as.factor(quality) ~ . - residual.sugar - fixed.acidity -
density - citric.acid - pH, data = wine.pr, Hess = TRUE)
Coefficients:
Value Std. Error t value
volatile.acidity -3.37758 0.321717 -10.499
chlorides -4.83975 1.254663 -3.857
free.sulfur.dioxide 0.01323 0.006547 2.020
total.sulfur.dioxide -0.01121 0.002244 -4.995
sulphates 2.82640 0.351570 8.039
alcohol 0.84021 0.056315 14.920
Intercepts:
Value Std. Error t value
3|4 2.1890 0.7326 2.9879
4|5 4.1037 0.6710 6.1155
5|6 7.8113 0.6687 11.6815
6|7 10.6328 0.7052 15.0780
7|8 13.6276 0.7596 17.9400
Residual Deviance: 3093.405
AIC: 3115.405
(ctable1 <- coef(summary(ord_mods)))
| Value | Std. Error | t value | |
|---|---|---|---|
| volatile.acidity | -3.37757880 | 0.321717050 | -10.498601 |
| chlorides | -4.83975357 | 1.254662677 | -3.857414 |
| free.sulfur.dioxide | 0.01322711 | 0.006546880 | 2.020369 |
| total.sulfur.dioxide | -0.01120806 | 0.002244033 | -4.994603 |
| sulphates | 2.82640284 | 0.351569894 | 8.039377 |
| alcohol | 0.84021340 | 0.056314704 | 14.919965 |
| 3|4 | 2.18899507 | 0.732615107 | 2.987920 |
| 4|5 | 4.10370205 | 0.671028839 | 6.115538 |
| 5|6 | 7.81126965 | 0.668686436 | 11.681514 |
| 6|7 | 10.63279309 | 0.705185658 | 15.078005 |
| 7|8 | 13.62760849 | 0.759622299 | 17.939980 |
## calculate and store p values
p <- pnorm(abs(ctable1[, "t value"]), lower.tail = FALSE) * 2
## combined table
(ctable1 <- cbind(ctable1, "p value" = p))
| Value | Std. Error | t value | p value | |
|---|---|---|---|---|
| volatile.acidity | -3.37757880 | 0.321717050 | -10.498601 | 8.767017e-26 |
| chlorides | -4.83975357 | 1.254662677 | -3.857414 | 1.145929e-04 |
| free.sulfur.dioxide | 0.01322711 | 0.006546880 | 2.020369 | 4.334511e-02 |
| total.sulfur.dioxide | -0.01120806 | 0.002244033 | -4.994603 | 5.895691e-07 |
| sulphates | 2.82640284 | 0.351569894 | 8.039377 | 9.029653e-16 |
| alcohol | 0.84021340 | 0.056314704 | 14.919965 | 2.444024e-50 |
| 3|4 | 2.18899507 | 0.732615107 | 2.987920 | 2.808834e-03 |
| 4|5 | 4.10370205 | 0.671028839 | 6.115538 | 9.623203e-10 |
| 5|6 | 7.81126965 | 0.668686436 | 11.681514 | 1.584487e-31 |
| 6|7 | 10.63279309 | 0.705185658 | 15.078005 | 2.259940e-51 |
| 7|8 | 13.62760849 | 0.759622299 | 17.939980 | 5.747631e-72 |
#(ci1 <- confint(ord_mods)) # default method gives profiled CIs
ci1 <- confint.default(ord_mods) # CIs assuming normality
ci1
# All predictors are significant
| 2.5 % | 97.5 % | |
|---|---|---|
| volatile.acidity | -4.0081326322 | -2.747024970 |
| chlorides | -7.2988472355 | -2.380659914 |
| free.sulfur.dioxide | 0.0003954655 | 0.026058764 |
| total.sulfur.dioxide | -0.0156062800 | -0.006809831 |
| sulphates | 2.1373385141 | 3.515467175 |
| alcohol | 0.7298386114 | 0.950588196 |
## odds ratios - These coefficients are called proportional odds ratios
exp(cbind(OR = coef(ord_mods), ci1))
| OR | 2.5 % | 97.5 % | |
|---|---|---|---|
| volatile.acidity | 0.034129990 | 0.018167289 | 0.06411833 |
| chlorides | 0.007909003 | 0.000676318 | 0.09248952 |
| free.sulfur.dioxide | 1.013314980 | 1.000395544 | 1.02640126 |
| total.sulfur.dioxide | 0.988854521 | 0.984514867 | 0.99321330 |
| sulphates | 16.884614873 | 8.476846572 | 33.63163613 |
| alcohol | 2.316861351 | 2.074745740 | 2.58723101 |
# Proportional Odds
sf <- function(y) {
c('Y>=0' = qlogis(mean(y >= 0)),
'Y>=1' = qlogis(mean(y >= 1)),
'Y>=2' = qlogis(mean(y >= 2)),
'Y>=3' = qlogis(mean(y >= 3)),
'Y>=4' = qlogis(mean(y >= 4)),
'Y>=6' = qlogis(mean(y >= 6)),
'Y>=7' = qlogis(mean(y >= 7)),
'Y>=8' = qlogis(mean(y >= 8)),
'Y>=9' = qlogis(mean(y >= 9)))
}
(s <- with(wine, summary(as.numeric(quality) ~ fixed.acidity + volatile.acidity + residual.sugar +
free.sulfur.dioxide + density + pH + sulphates + alcohol, fun=sf)))
as.numeric(quality) N= 1599 +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ | | |N |Y>=0|Y>=1|Y>=2 |Y>=3|Y>=4|Y>=6|Y>=7|Y>=8|Y>=9| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |fixed.acidity |[4.6, 7.2) | 419|Inf |Inf | 0.109895671|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[7.2, 8.0) | 397|Inf |Inf |-0.136230450|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[8.0, 9.3) | 387|Inf |Inf | 0.036179657|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[9.3,15.9] | 396|Inf |Inf | 0.559615788|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |volatile.acidity |[0.120,0.395)| 406|Inf |Inf | 1.158630298|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.395,0.530)| 410|Inf |Inf | 0.304776506|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.530,0.645)| 407|Inf |Inf |-0.202159899|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.645,1.580]| 376|Inf |Inf |-0.673265810|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |residual.sugar |[0.90, 2.00) | 464|Inf |Inf | 0.051735674|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[2.00, 2.25) | 419|Inf |Inf | 0.205975748|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[2.25, 2.65) | 361|Inf |Inf | 0.205708489|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[2.65,15.50] | 355|Inf |Inf | 0.107144637|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |free.sulfur.dioxide|[ 1, 8) | 408|Inf |Inf | 0.296265816|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[ 8,15) | 438|Inf |Inf | 0.164755233|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[15,22) | 355|Inf |Inf | 0.118448150|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[22,72] | 398|Inf |Inf |-0.030153038|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |density |[0.990,0.996)| 407|Inf |Inf | 1.005745604|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.996,0.997)| 396|Inf |Inf |-0.161969328|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.997,0.998)| 397|Inf |Inf |-0.237891408|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.998,1.004]| 399|Inf |Inf | 0.005012542|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |pH |[2.74,3.22) | 424|Inf |Inf | 0.113328685|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[3.22,3.32) | 398|Inf |Inf | 0.130839601|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[3.32,3.41) | 390|Inf |Inf | 0.226605759|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[3.41,4.01] | 387|Inf |Inf | 0.087911872|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |sulphates |[0.33,0.56) | 420|Inf |Inf |-0.881547783|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.56,0.63) | 409|Inf |Inf |-0.024451096|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.63,0.74) | 384|Inf |Inf | 0.566784277|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[0.74,2.00] | 386|Inf |Inf | 1.037368663|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |alcohol |[ 8.40, 9.55)| 436|Inf |Inf |-0.922721622|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[ 9.55,10.30)| 406|Inf |Inf |-0.399311243|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[10.30,11.20)| 377|Inf |Inf | 0.583662948|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| | |[11.20,14.90]| 380|Inf |Inf | 1.797913335|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+ |Overall | |1599|Inf |Inf | 0.139060434|-Inf|-Inf|-Inf|-Inf|-Inf|-Inf| +-------------------+-------------+----+----+----+------------+----+----+----+----+----+----+
# Example
glm(I(as.numeric(quality) >= 4) ~ alcohol, family ="binomial", data = wine)
Warning message: “glm.fit: algorithm did not converge”
Call: glm(formula = I(as.numeric(quality) >= 4) ~ alcohol, family = "binomial",
data = wine)
Coefficients:
(Intercept) alcohol
-2.657e+01 -7.662e-17
Degrees of Freedom: 1598 Total (i.e. Null); 1597 Residual
Null Deviance: 0
Residual Deviance: 9.277e-09 AIC: 4
#s[, 4] <- s[, 4] - s[, 3]
#s[, 3] <- s[, 3] - s[, 3]
#s
glm(I(as.numeric(quality) >= 7) ~ alcohol, family ="binomial", data = wine)
Warning message: “glm.fit: algorithm did not converge”
Call: glm(formula = I(as.numeric(quality) >= 7) ~ alcohol, family = "binomial",
data = wine)
Coefficients:
(Intercept) alcohol
-2.657e+01 -7.662e-17
Degrees of Freedom: 1598 Total (i.e. Null); 1597 Residual
Null Deviance: 0
Residual Deviance: 9.277e-09 AIC: 4
rf = randomForest(quality ~ .,
data=wine_train,
importance=TRUE, ntree=501, confusion=TRUE, err.rate=TRUE,
parms=list(split="gini"), proximity = TRUE)
print(rf)
varImpPlot(rf, main = "Original Random Forest")
plot(rf, main = 'Original Random Forest')
rfortr <- predict(rf, newdata = wine_train, type = "class")
rforts <- predict(rf, newdata = wine_test, type = "class")
#Confusion Matrix
# Train
cmrftr <- confusionMatrix(as.factor(rfortr), wine_train$quality)
print(cmrftr)
accrftr <- cmrftr$overall["Accuracy"]
print(paste('RF Original Train Error = ', round(1 - accrftr['Accuracy'],4)))
# Test
cmrfts <- confusionMatrix(as.factor(rforts), wine_test$quality)
print(cmrfts)
accrfts <- cmrfts$overall["Accuracy"]
print(paste('RF Original Test Error = ', round(1 - accrfts['Accuracy'],4)))
#Optimization mtry
mtry <- tuneRF(wine_train[,1:11],wine_train$quality, ntreeTry=1000,
stepFactor=1.5,improve=0.01, trace=TRUE, plot=TRUE)
best.m <- mtry[mtry[, 2] == min(mtry[, 2]), 1]
print(mtry)
print(paste('Best mtry = ', best.m))
rfb <-randomForest(quality ~ .,
data=wine_train,
importance=TRUE, ntree=1000, mtry= best.m,
confusion=TRUE, err.rate=TRUE, parms=list(split="gini"),
proximity = TRUE)
print(rfb)
varImpPlot(rfb, main = "Optimized Random Forest", sort = TRUE)
plot(rfb, main = 'Optimized Random Forest')
hist(treesize(rfb),
main = "No. of Nodes for the Trees - Optimized RandomForest",
col = "green")
Call:
randomForest(formula = quality ~ ., data = wine_train, importance = TRUE, ntree = 501, confusion = TRUE, err.rate = TRUE, parms = list(split = "gini"), proximity = TRUE)
Type of random forest: classification
Number of trees: 501
No. of variables tried at each split: 3
OOB estimate of error rate: 23.82%
Confusion matrix:
0 1 class.error
0 92 42 0.3134328
1 34 151 0.1837838
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 134 0
1 0 185
Accuracy : 1
95% CI : (0.9885, 1)
No Information Rate : 0.5799
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 1
Mcnemar's Test P-Value : NA
Sensitivity : 1.0000
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 1.0000
Prevalence : 0.4201
Detection Rate : 0.4201
Detection Prevalence : 0.4201
Balanced Accuracy : 1.0000
'Positive' Class : 0
[1] "RF Original Train Error = 0"
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 415 109
1 195 561
Accuracy : 0.7625
95% CI : (0.7382, 0.7856)
No Information Rate : 0.5234
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5209
Mcnemar's Test P-Value : 1.088e-06
Sensitivity : 0.6803
Specificity : 0.8373
Pos Pred Value : 0.7920
Neg Pred Value : 0.7421
Prevalence : 0.4766
Detection Rate : 0.3242
Detection Prevalence : 0.4094
Balanced Accuracy : 0.7588
'Positive' Class : 0
[1] "RF Original Test Error = 0.2375"
mtry = 3 OOB error = 25.08%
Searching left ...
mtry = 2 OOB error = 23.82%
0.05 0.01
Searching right ...
mtry = 4 OOB error = 25.39%
-0.06578947 0.01
mtry OOBError
2.OOB 2 0.2382445
3.OOB 3 0.2507837
4.OOB 4 0.2539185
[1] "Best mtry = 2"
Call:
randomForest(formula = quality ~ ., data = wine_train, importance = TRUE, ntree = 1000, mtry = best.m, confusion = TRUE, err.rate = TRUE, parms = list(split = "gini"), proximity = TRUE)
Type of random forest: classification
Number of trees: 1000
No. of variables tried at each split: 2
OOB estimate of error rate: 24.45%
Confusion matrix:
0 1 class.error
0 88 46 0.3432836
1 32 153 0.1729730
#Optimized RF
rforoptr <- predict(rfb, newdata = wine_train, type = "class")
rforopts <- predict(rfb, newdata = wine_test, type = "class")
# Train
cmrfbtr <- confusionMatrix(as.factor(rforoptr), wine_train$quality)
print(cmrfbtr)
accrfbtr <- cmrfbtr$overall["Accuracy"]
print(paste('RF Optimized Train Error = ', round(1 - accrfbtr['Accuracy'],4)))
# Test
cmrfbts <- confusionMatrix(as.factor(rforopts), wine_test$quality)
print(cmrfbts)
accrfbts <- cmrfbts$overall["Accuracy"]
print(paste('RF Optimized Test Error = ', round(1 - accrfbts['Accuracy'],4)))
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 134 0
1 0 185
Accuracy : 1
95% CI : (0.9885, 1)
No Information Rate : 0.5799
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 1
Mcnemar's Test P-Value : NA
Sensitivity : 1.0000
Specificity : 1.0000
Pos Pred Value : 1.0000
Neg Pred Value : 1.0000
Prevalence : 0.4201
Detection Rate : 0.4201
Detection Prevalence : 0.4201
Balanced Accuracy : 1.0000
'Positive' Class : 0
[1] "RF Optimized Train Error = 0"
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 405 108
1 205 562
Accuracy : 0.7555
95% CI : (0.731, 0.7788)
No Information Rate : 0.5234
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5063
Mcnemar's Test P-Value : 5.755e-08
Sensitivity : 0.6639
Specificity : 0.8388
Pos Pred Value : 0.7895
Neg Pred Value : 0.7327
Prevalence : 0.4766
Detection Rate : 0.3164
Detection Prevalence : 0.4008
Balanced Accuracy : 0.7514
'Positive' Class : 0
[1] "RF Optimized Test Error = 0.2445"
control <- trainControl(method='repeatedcv',
number=10,
repeats=3,
search = 'random')
rf_random <- train(quality ~ .,
data = wine_train,
method = 'rf',
metric = 'Accuracy',
tuneLength = 15,
trControl = control)
print(rf_random)
Random Forest 319 samples 11 predictor 2 classes: '0', '1' No pre-processing Resampling: Cross-Validated (10 fold, repeated 3 times) Summary of sample sizes: 287, 287, 288, 287, 287, 288, ... Resampling results across tuning parameters: mtry Accuracy Kappa 3 0.7471571 0.4776934 4 0.7419762 0.4670078 5 0.7430199 0.4696736 6 0.7400232 0.4641613 8 0.7409009 0.4654599 9 0.7357578 0.4547457 10 0.7305097 0.4440194 Accuracy was used to select the optimal model using the largest value. The final value used for the model was mtry = 3.
importance(rfb)
| 0 | 1 | MeanDecreaseAccuracy | MeanDecreaseGini | |
|---|---|---|---|---|
| fixed.acidity | 8.148865 | 9.334083 | 12.585291 | 11.430618 |
| volatile.acidity | 16.452604 | 10.401976 | 18.894120 | 16.196699 |
| citric.acid | 4.230159 | 7.862547 | 9.533028 | 11.048381 |
| residual.sugar | 6.055490 | 1.233295 | 5.288295 | 9.826238 |
| chlorides | 8.080663 | 6.039082 | 10.133803 | 12.962397 |
| free.sulfur.dioxide | 4.408554 | 1.527612 | 4.251549 | 9.437758 |
| total.sulfur.dioxide | 11.953963 | 8.535855 | 14.664830 | 14.798376 |
| density | 9.944922 | 10.892481 | 15.256074 | 14.155264 |
| pH | 2.953683 | 8.740439 | 8.753233 | 11.335217 |
| sulphates | 20.344905 | 20.638785 | 27.375435 | 19.674805 |
| alcohol | 30.500449 | 22.294560 | 34.340397 | 24.105379 |
dtnp = rpart(quality ~ ., data=wine_train,
method = "class",
model = TRUE, xval = 10, parms=list(split="gini"))
print(summary(dtnp))
#Redistribution Error and R-squared
#Training
prednp <- predict(dtnp, newdata = wine_train, type = "class")
mcnp <- table(wine_train$quality, prednp)
err.resub <- round(1.0 - (mcnp[1,1] + mcnp[2,2])/sum(mcnp),3)
print("Non-pruned Tree without CV")
print(paste('Training Redistribution Error = ', err.resub))
printcp(dtnp)
plotcp(dtnp)
rpart.plot(dtnp, yesno = TRUE, extra = 106)
fancyRpartPlot(dtnp, uniform=TRUE, main="Non-Pruned Classification Tree")
#Confusion Matrix
cmdt <- confusionMatrix(as.factor(prednp), wine_train$quality)
accdt <- cmdt$overall["Accuracy"]
print(paste('LG CV Train Error = ', round(1 - accdt['Accuracy'],4)))
Call:
rpart(formula = quality ~ ., data = wine_train, method = "class",
model = TRUE, parms = list(split = "gini"), xval = 10)
n= 319
CP nsplit rel error xerror xstd
1 0.35820896 0 1.0000000 1.0000000 0.06578670
2 0.07462687 1 0.6417910 0.6417910 0.05914623
3 0.04477612 2 0.5671642 0.6194030 0.05847832
4 0.01741294 3 0.5223881 0.6194030 0.05847832
5 0.01119403 7 0.4402985 0.6865672 0.06038189
6 0.01000000 13 0.3432836 0.7164179 0.06113472
Variable importance
alcohol volatile.acidity total.sulfur.dioxide
27 17 13
fixed.acidity density citric.acid
10 7 6
sulphates free.sulfur.dioxide pH
6 5 4
chlorides residual.sugar
3 2
Node number 1: 319 observations, complexity param=0.358209
predicted class=1 expected loss=0.4200627 P(node) =1
class counts: 134 185
probabilities: 0.420 0.580
left son=2 (132 obs) right son=3 (187 obs)
Primary splits:
alcohol < 9.95 to the left, improve=30.85635, (0 missing)
sulphates < 0.645 to the left, improve=20.91785, (0 missing)
volatile.acidity < 0.535 to the right, improve=15.71795, (0 missing)
total.sulfur.dioxide < 56.5 to the right, improve=14.64462, (0 missing)
density < 0.995575 to the right, improve=11.50913, (0 missing)
Surrogate splits:
total.sulfur.dioxide < 58.5 to the right, agree=0.677, adj=0.220, (0 split)
sulphates < 0.595 to the left, agree=0.649, adj=0.152, (0 split)
volatile.acidity < 0.5825 to the right, agree=0.639, adj=0.129, (0 split)
density < 0.996595 to the right, agree=0.636, adj=0.121, (0 split)
citric.acid < 0.315 to the left, agree=0.605, adj=0.045, (0 split)
Node number 2: 132 observations, complexity param=0.07462687
predicted class=0 expected loss=0.3181818 P(node) =0.4137931
class counts: 90 42
probabilities: 0.682 0.318
left son=4 (114 obs) right son=5 (18 obs)
Primary splits:
fixed.acidity < 9.95 to the left, improve=8.804891, (0 missing)
total.sulfur.dioxide < 51.5 to the right, improve=7.441344, (0 missing)
volatile.acidity < 0.535 to the right, improve=5.672727, (0 missing)
sulphates < 0.625 to the left, improve=5.522727, (0 missing)
free.sulfur.dioxide < 8.5 to the right, improve=3.594296, (0 missing)
Surrogate splits:
citric.acid < 0.445 to the left, agree=0.924, adj=0.444, (0 split)
density < 0.99978 to the left, agree=0.902, adj=0.278, (0 split)
volatile.acidity < 0.325 to the right, agree=0.879, adj=0.111, (0 split)
pH < 3.045 to the right, agree=0.879, adj=0.111, (0 split)
residual.sugar < 6.05 to the left, agree=0.871, adj=0.056, (0 split)
Node number 3: 187 observations, complexity param=0.04477612
predicted class=1 expected loss=0.2352941 P(node) =0.5862069
class counts: 44 143
probabilities: 0.235 0.765
left son=6 (8 obs) right son=7 (179 obs)
Primary splits:
volatile.acidity < 0.79 to the right, improve=6.840207, (0 missing)
sulphates < 0.675 to the left, improve=6.259504, (0 missing)
alcohol < 11.35 to the left, improve=5.215533, (0 missing)
pH < 3.485 to the right, improve=2.745638, (0 missing)
chlorides < 0.0755 to the right, improve=2.311075, (0 missing)
Node number 4: 114 observations, complexity param=0.01741294
predicted class=0 expected loss=0.245614 P(node) =0.3573668
class counts: 86 28
probabilities: 0.754 0.246
left son=8 (49 obs) right son=9 (65 obs)
Primary splits:
total.sulfur.dioxide < 51.5 to the right, improve=4.621752, (0 missing)
pH < 3.305 to the left, improve=4.019657, (0 missing)
chlorides < 0.0815 to the right, improve=2.976585, (0 missing)
volatile.acidity < 0.535 to the right, improve=2.535088, (0 missing)
free.sulfur.dioxide < 30.5 to the right, improve=1.925614, (0 missing)
Surrogate splits:
free.sulfur.dioxide < 20.5 to the right, agree=0.772, adj=0.469, (0 split)
residual.sugar < 2.45 to the right, agree=0.693, adj=0.286, (0 split)
citric.acid < 0.255 to the right, agree=0.649, adj=0.184, (0 split)
pH < 3.285 to the left, agree=0.632, adj=0.143, (0 split)
chlorides < 0.1015 to the right, agree=0.623, adj=0.122, (0 split)
Node number 5: 18 observations
predicted class=1 expected loss=0.2222222 P(node) =0.05642633
class counts: 4 14
probabilities: 0.222 0.778
Node number 6: 8 observations
predicted class=0 expected loss=0.125 P(node) =0.02507837
class counts: 7 1
probabilities: 0.875 0.125
Node number 7: 179 observations, complexity param=0.01119403
predicted class=1 expected loss=0.2067039 P(node) =0.5611285
class counts: 37 142
probabilities: 0.207 0.793
left son=14 (104 obs) right son=15 (75 obs)
Primary splits:
alcohol < 11.15 to the left, improve=5.062885, (0 missing)
sulphates < 0.665 to the left, improve=4.656691, (0 missing)
volatile.acidity < 0.395 to the right, improve=2.618035, (0 missing)
density < 0.99517 to the right, improve=2.227632, (0 missing)
pH < 3.485 to the right, improve=1.579728, (0 missing)
Surrogate splits:
density < 0.99479 to the right, agree=0.782, adj=0.480, (0 split)
chlorides < 0.0635 to the right, agree=0.670, adj=0.213, (0 split)
fixed.acidity < 6.65 to the right, agree=0.642, adj=0.147, (0 split)
total.sulfur.dioxide < 18.5 to the right, agree=0.626, adj=0.107, (0 split)
pH < 3.445 to the left, agree=0.620, adj=0.093, (0 split)
Node number 8: 49 observations
predicted class=0 expected loss=0.08163265 P(node) =0.153605
class counts: 45 4
probabilities: 0.918 0.082
Node number 9: 65 observations, complexity param=0.01741294
predicted class=0 expected loss=0.3692308 P(node) =0.2037618
class counts: 41 24
probabilities: 0.631 0.369
left son=18 (20 obs) right son=19 (45 obs)
Primary splits:
pH < 3.295 to the left, improve=2.776923, (0 missing)
chlorides < 0.0795 to the right, improve=2.759682, (0 missing)
volatile.acidity < 0.6725 to the right, improve=2.042881, (0 missing)
total.sulfur.dioxide < 44 to the left, improve=1.889718, (0 missing)
density < 0.9957 to the right, improve=1.868056, (0 missing)
Surrogate splits:
fixed.acidity < 8.15 to the right, agree=0.800, adj=0.35, (0 split)
citric.acid < 0.205 to the right, agree=0.769, adj=0.25, (0 split)
chlorides < 0.1 to the right, agree=0.754, adj=0.20, (0 split)
volatile.acidity < 0.35 to the left, agree=0.738, adj=0.15, (0 split)
sulphates < 0.855 to the right, agree=0.723, adj=0.10, (0 split)
Node number 14: 104 observations, complexity param=0.01119403
predicted class=1 expected loss=0.3076923 P(node) =0.3260188
class counts: 32 72
probabilities: 0.308 0.692
left son=28 (67 obs) right son=29 (37 obs)
Primary splits:
sulphates < 0.69 to the left, improve=3.420238, (0 missing)
volatile.acidity < 0.395 to the right, improve=2.563548, (0 missing)
pH < 3.48 to the right, improve=2.481477, (0 missing)
chlorides < 0.0915 to the right, improve=2.458008, (0 missing)
density < 0.99955 to the right, improve=1.210616, (0 missing)
Surrogate splits:
volatile.acidity < 0.405 to the right, agree=0.769, adj=0.351, (0 split)
fixed.acidity < 8.85 to the left, agree=0.731, adj=0.243, (0 split)
density < 0.9989 to the left, agree=0.731, adj=0.243, (0 split)
citric.acid < 0.465 to the left, agree=0.721, adj=0.216, (0 split)
total.sulfur.dioxide < 64.5 to the left, agree=0.673, adj=0.081, (0 split)
Node number 15: 75 observations
predicted class=1 expected loss=0.06666667 P(node) =0.2351097
class counts: 5 70
probabilities: 0.067 0.933
Node number 18: 20 observations
predicted class=0 expected loss=0.15 P(node) =0.06269592
class counts: 17 3
probabilities: 0.850 0.150
Node number 19: 45 observations, complexity param=0.01741294
predicted class=0 expected loss=0.4666667 P(node) =0.1410658
class counts: 24 21
probabilities: 0.533 0.467
left son=38 (14 obs) right son=39 (31 obs)
Primary splits:
volatile.acidity < 0.6725 to the right, improve=4.261751, (0 missing)
citric.acid < 0.055 to the left, improve=4.114286, (0 missing)
sulphates < 0.535 to the left, improve=3.869828, (0 missing)
total.sulfur.dioxide < 23.5 to the left, improve=2.588940, (0 missing)
alcohol < 9.35 to the left, improve=2.034615, (0 missing)
Surrogate splits:
total.sulfur.dioxide < 14.5 to the left, agree=0.778, adj=0.286, (0 split)
fixed.acidity < 8.05 to the right, agree=0.756, adj=0.214, (0 split)
free.sulfur.dioxide < 4.5 to the left, agree=0.756, adj=0.214, (0 split)
sulphates < 0.445 to the left, agree=0.733, adj=0.143, (0 split)
citric.acid < 0.01 to the left, agree=0.711, adj=0.071, (0 split)
Node number 28: 67 observations, complexity param=0.01119403
predicted class=1 expected loss=0.4029851 P(node) =0.2100313
class counts: 27 40
probabilities: 0.403 0.597
left son=56 (57 obs) right son=57 (10 obs)
Primary splits:
fixed.acidity < 6.85 to the right, improve=2.158104, (0 missing)
chlorides < 0.0915 to the right, improve=2.072139, (0 missing)
citric.acid < 0.075 to the right, improve=1.663237, (0 missing)
volatile.acidity < 0.4725 to the left, improve=1.559631, (0 missing)
density < 0.995195 to the right, improve=1.057854, (0 missing)
Surrogate splits:
density < 0.99504 to the right, agree=0.881, adj=0.2, (0 split)
Node number 29: 37 observations
predicted class=1 expected loss=0.1351351 P(node) =0.1159875
class counts: 5 32
probabilities: 0.135 0.865
Node number 38: 14 observations
predicted class=0 expected loss=0.1428571 P(node) =0.04388715
class counts: 12 2
probabilities: 0.857 0.143
Node number 39: 31 observations, complexity param=0.01741294
predicted class=1 expected loss=0.3870968 P(node) =0.09717868
class counts: 12 19
probabilities: 0.387 0.613
left son=78 (8 obs) right son=79 (23 obs)
Primary splits:
alcohol < 9.35 to the left, improve=2.840112, (0 missing)
residual.sugar < 1.85 to the left, improve=1.982405, (0 missing)
density < 0.99725 to the right, improve=1.507923, (0 missing)
citric.acid < 0.05 to the left, improve=0.855132, (0 missing)
free.sulfur.dioxide < 11 to the left, improve=0.855132, (0 missing)
Surrogate splits:
citric.acid < 0.195 to the right, agree=0.774, adj=0.125, (0 split)
residual.sugar < 1.65 to the left, agree=0.774, adj=0.125, (0 split)
chlorides < 0.0955 to the right, agree=0.774, adj=0.125, (0 split)
total.sulfur.dioxide < 16.5 to the left, agree=0.774, adj=0.125, (0 split)
Node number 56: 57 observations, complexity param=0.01119403
predicted class=1 expected loss=0.4561404 P(node) =0.1786834
class counts: 26 31
probabilities: 0.456 0.544
left son=112 (22 obs) right son=113 (35 obs)
Primary splits:
volatile.acidity < 0.4725 to the left, improve=2.327455, (0 missing)
chlorides < 0.0925 to the right, improve=1.878604, (0 missing)
fixed.acidity < 7.45 to the left, improve=1.804511, (0 missing)
pH < 3.335 to the right, improve=1.794768, (0 missing)
citric.acid < 0.075 to the right, improve=1.461654, (0 missing)
Surrogate splits:
citric.acid < 0.415 to the right, agree=0.789, adj=0.455, (0 split)
total.sulfur.dioxide < 9.5 to the left, agree=0.684, adj=0.182, (0 split)
pH < 3.465 to the right, agree=0.684, adj=0.182, (0 split)
residual.sugar < 5.2 to the right, agree=0.649, adj=0.091, (0 split)
chlorides < 0.0515 to the left, agree=0.649, adj=0.091, (0 split)
Node number 57: 10 observations
predicted class=1 expected loss=0.1 P(node) =0.03134796
class counts: 1 9
probabilities: 0.100 0.900
Node number 78: 8 observations
predicted class=0 expected loss=0.25 P(node) =0.02507837
class counts: 6 2
probabilities: 0.750 0.250
Node number 79: 23 observations
predicted class=1 expected loss=0.2608696 P(node) =0.07210031
class counts: 6 17
probabilities: 0.261 0.739
Node number 112: 22 observations, complexity param=0.01119403
predicted class=0 expected loss=0.3636364 P(node) =0.06896552
class counts: 14 8
probabilities: 0.636 0.364
left son=224 (14 obs) right son=225 (8 obs)
Primary splits:
total.sulfur.dioxide < 19.5 to the right, improve=3.753247, (0 missing)
volatile.acidity < 0.36 to the right, improve=2.524675, (0 missing)
free.sulfur.dioxide < 6.5 to the right, improve=2.524675, (0 missing)
sulphates < 0.605 to the right, improve=2.048485, (0 missing)
density < 0.99631 to the right, improve=1.717532, (0 missing)
Surrogate splits:
free.sulfur.dioxide < 6.5 to the right, agree=0.955, adj=0.875, (0 split)
volatile.acidity < 0.385 to the right, agree=0.909, adj=0.750, (0 split)
chlorides < 0.065 to the right, agree=0.773, adj=0.375, (0 split)
alcohol < 10.65 to the left, agree=0.773, adj=0.375, (0 split)
fixed.acidity < 8.95 to the left, agree=0.727, adj=0.250, (0 split)
Node number 113: 35 observations, complexity param=0.01119403
predicted class=1 expected loss=0.3428571 P(node) =0.1097179
class counts: 12 23
probabilities: 0.343 0.657
left son=226 (13 obs) right son=227 (22 obs)
Primary splits:
volatile.acidity < 0.6375 to the right, improve=3.072128, (0 missing)
chlorides < 0.0925 to the right, improve=2.414286, (0 missing)
sulphates < 0.595 to the left, improve=1.904762, (0 missing)
density < 0.99563 to the left, improve=1.651058, (0 missing)
total.sulfur.dioxide < 42 to the left, improve=1.477722, (0 missing)
Surrogate splits:
free.sulfur.dioxide < 10.5 to the left, agree=0.743, adj=0.308, (0 split)
total.sulfur.dioxide < 33 to the left, agree=0.714, adj=0.231, (0 split)
sulphates < 0.465 to the left, agree=0.686, adj=0.154, (0 split)
citric.acid < 0.005 to the left, agree=0.657, adj=0.077, (0 split)
residual.sugar < 2.05 to the left, agree=0.657, adj=0.077, (0 split)
Node number 224: 14 observations
predicted class=0 expected loss=0.1428571 P(node) =0.04388715
class counts: 12 2
probabilities: 0.857 0.143
Node number 225: 8 observations
predicted class=1 expected loss=0.25 P(node) =0.02507837
class counts: 2 6
probabilities: 0.250 0.750
Node number 226: 13 observations
predicted class=0 expected loss=0.3846154 P(node) =0.04075235
class counts: 8 5
probabilities: 0.615 0.385
Node number 227: 22 observations
predicted class=1 expected loss=0.1818182 P(node) =0.06896552
class counts: 4 18
probabilities: 0.182 0.818
n= 319
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 319 134 1 (0.42006270 0.57993730)
2) alcohol< 9.95 132 42 0 (0.68181818 0.31818182)
4) fixed.acidity< 9.95 114 28 0 (0.75438596 0.24561404)
8) total.sulfur.dioxide>=51.5 49 4 0 (0.91836735 0.08163265) *
9) total.sulfur.dioxide< 51.5 65 24 0 (0.63076923 0.36923077)
18) pH< 3.295 20 3 0 (0.85000000 0.15000000) *
19) pH>=3.295 45 21 0 (0.53333333 0.46666667)
38) volatile.acidity>=0.6725 14 2 0 (0.85714286 0.14285714) *
39) volatile.acidity< 0.6725 31 12 1 (0.38709677 0.61290323)
78) alcohol< 9.35 8 2 0 (0.75000000 0.25000000) *
79) alcohol>=9.35 23 6 1 (0.26086957 0.73913043) *
5) fixed.acidity>=9.95 18 4 1 (0.22222222 0.77777778) *
3) alcohol>=9.95 187 44 1 (0.23529412 0.76470588)
6) volatile.acidity>=0.79 8 1 0 (0.87500000 0.12500000) *
7) volatile.acidity< 0.79 179 37 1 (0.20670391 0.79329609)
14) alcohol< 11.15 104 32 1 (0.30769231 0.69230769)
28) sulphates< 0.69 67 27 1 (0.40298507 0.59701493)
56) fixed.acidity>=6.85 57 26 1 (0.45614035 0.54385965)
112) volatile.acidity< 0.4725 22 8 0 (0.63636364 0.36363636)
224) total.sulfur.dioxide>=19.5 14 2 0 (0.85714286 0.14285714) *
225) total.sulfur.dioxide< 19.5 8 2 1 (0.25000000 0.75000000) *
113) volatile.acidity>=0.4725 35 12 1 (0.34285714 0.65714286)
226) volatile.acidity>=0.6375 13 5 0 (0.61538462 0.38461538) *
227) volatile.acidity< 0.6375 22 4 1 (0.18181818 0.81818182) *
57) fixed.acidity< 6.85 10 1 1 (0.10000000 0.90000000) *
29) sulphates>=0.69 37 5 1 (0.13513514 0.86486486) *
15) alcohol>=11.15 75 5 1 (0.06666667 0.93333333) *
[1] "Non-pruned Tree without CV"
[1] "Training Redistribution Error = 0.144"
Classification tree:
rpart(formula = quality ~ ., data = wine_train, method = "class",
model = TRUE, parms = list(split = "gini"), xval = 10)
Variables actually used in tree construction:
[1] alcohol fixed.acidity pH
[4] sulphates total.sulfur.dioxide volatile.acidity
Root node error: 134/319 = 0.42006
n= 319
CP nsplit rel error xerror xstd
1 0.358209 0 1.00000 1.00000 0.065787
2 0.074627 1 0.64179 0.64179 0.059146
3 0.044776 2 0.56716 0.61940 0.058478
4 0.017413 3 0.52239 0.61940 0.058478
5 0.011194 7 0.44030 0.68657 0.060382
6 0.010000 13 0.34328 0.71642 0.061135
Error in fancyRpartPlot(dtnp, uniform = TRUE, main = "Non-Pruned Classification Tree"): could not find function "fancyRpartPlot" Traceback:
Cross validation before pruning
# specify parameters for cross validation
control <- trainControl(method = "repeatedcv",
number = 10, # number of folds
repeats = 3, # repeat times
search = "grid")
dtcv <- train(quality ~ .,
data = wine_train,
method = "rpart",
trControl = control)
print.train(dtcv)
rpart.plot(dtcv$finalModel, extra = 106)
plot.train(dtcv)
# Test
pred_test <- predict(dtnp, newdata = wine_test, type = "class")
#Confusion Matrix
cmdt_test <- confusionMatrix(as.factor(pred_test), wine_test$quality)
accdtst <- cmdt_test$overall["Accuracy"]
print(paste('LG CV Test Error = ', round(1 - accdtst['Accuracy'],4)))
CART 319 samples 11 predictor 2 classes: '0', '1' No pre-processing Resampling: Cross-Validated (10 fold, repeated 3 times) Summary of sample sizes: 287, 288, 288, 286, 287, 287, ... Resampling results across tuning parameters: cp Accuracy Kappa 0.04477612 0.7325513 0.4414546 0.07462687 0.7270446 0.4353154 0.35820896 0.6297990 0.1813829 Accuracy was used to select the optimal model using the largest value. The final value used for the model was cp = 0.04477612.
[1] "LG CV Test Error = 0.307"
Pruning
opt <- which.min(dtnp$cptable[, "xerror"]);
cp1 <- dtnp$cptable[opt, "CP"];
print(paste('Optimal cp = ', cp1))
dtpr <- prune(dtnp,cp=cp1);
#fancyRpartPlot(dtpr, uniform=TRUE, main="Pruned Tree")
dtPrednp <- predict(dtpr, newdata = wine_train, type = "class")
#Testing
predpr_test <- predict(dtpr, newdata = wine_test, type = "class")
#Confusion Matrix
#Testing
cmdtpr_test <- confusionMatrix(as.factor(predpr_test), wine_test$quality)
accdtstpr <- cmdtpr_test$overall["Accuracy"]
print(paste('DT CV Test Error = ', round(1 - accdtstpr['Accuracy'],4)))
#Confusion Matrix
#Training
cmdtpr <- confusionMatrix(as.factor(dtPrednp), wine_train$quality)
accdtpr <- cmdtpr$overall["Accuracy"]
print(paste('DT CV Train Error = ', round(1 - accdtpr['Accuracy'],4)))
rpart.plot(dtpr, yesno = TRUE)
#fancyRpartPlot(dtpr, uniform=TRUE, main="Pruned Classification Tree")
[1] "Optimal cp = 0.0447761194029851" [1] "DT CV Test Error = 0.3062" [1] "DT CV Train Error = 0.2382"
Implementation of Monte Carlos CV for: Logistic Regression, KNN, SVM, Ordinal Regression, Random Forest, Decision Tree¶
set.seed(7406)
testspl <- 0.2
n = dim(wine)[1]
n1 = round(n * testspl)
B= 100
b_index <- list()
te1 <- list()
te2 <- list()
te3 <- list()
te4 <- list()
te5 <- list()
te6 <- list()
te7 <- list()
for (b in 1:B) {
### randomly select n1 observations as a new training subset in each loop
flag <- sort(sample(1:n, n1));
wine_train_temp <- wine[-flag,]; ## temp training set for CV
wine_test_temp <- wine[flag,]; ## temp testing set for CV
b_index <- c(b_index, b)
# Model 1: Logistic Regression with All Predictors
lgall <- glm(quality ~ ., data = wine_train_temp, family = binomial(link="logit"))
prediction <- predict(lgall,wine_test_temp[,1:11], type = "response")
prediction <- ifelse(prediction>= .5, 1,0)
cm <- confusionMatrix(wine_test_temp$quality, as.factor(prediction))
te1 <- c(te1,round(cm$overall[['Accuracy']],3))
# Model 2: Logistic Regression with Significant Predictors
lgselect <- glm(quality ~ .- residual.sugar - fixed.acidity - density -
citric.acid - pH,
data = wine,
family = binomial(link="logit"))
prediction <- predict(lgselect,wine_test_temp[,1:11], type = "response")
prediction <- ifelse(prediction>= .5, 1,0)
cm <- confusionMatrix(wine_test_temp$quality, as.factor(prediction))
te2 <- c(te2,round(cm$overall[['Accuracy']],3))
# Model 3: Knn at k = 3
knn <- knn(train=wine_train_temp, test=wine_test_temp,
cl=wine_train_temp$quality, k=3)
cm <- confusionMatrix(wine_test_temp$quality, as.factor(knn))
te3 <- c(te3,round(cm$overall[['Accuracy']],3))
# Model 4: SVM C = 3
classifier = svm(formula = quality ~ .,data = wine_train_temp,type =
'C-classification', cost = c_best, kernel = 'radial')
predictions <- predict(classifier, newdata = wine_test_temp[,1:11])
cm <- confusionMatrix(wine_test_temp$quality, as.factor(predictions))
te4 <- c(te4,round(cm$overall[['Accuracy']],3))
# Model 5: Random Forest - mtry
rf <- randomForest(quality~., data=wine_train_temp, proximity=TRUE, mtry = best.m)
predictions <- predict(rf, wine_test_temp)
cm <- confusionMatrix(predictions, wine_test_temp$quality)
te5 <- c(te5,round(cm$overall[['Accuracy']],3))
# Model 6: Decision Tree - cp
fit <- rpart(quality~., data = wine_train_temp, method = 'class')
dtpr <- prune(fit,cp=cp1)
predictions <- predict(dtpr, wine_test_temp, type = 'class')
cm <- confusionMatrix(wine_test_temp$quality, as.factor(predictions))
te6 <- c(te6,round(cm$overall[['Accuracy']],3))
}
TEALL <- data.frame(unlist(te1), unlist(te2), unlist(te3), unlist(te4),
unlist(te5), unlist(te6))
colnames(TEALL) <- c("LogReg", "LogRegSelect", "Knn", "Svm", "RF", "DTree")
results_viz = data.frame(unlist(apply(TEALL, 2, mean)))
results_viz
| unlist.apply.TEALL..2..mean.. | |
|---|---|
| LogReg | 0.74179 |
| LogRegSelect | 0.74913 |
| Knn | 0.75220 |
| Svm | 0.76776 |
| RF | 0.81840 |
| DTree | 0.68710 |
BOOSTING
All variables
# Boosting
set.seed(7406)
#wine <- rbind(white, red)
wine <- red
wine.pr <- wine[1:11]
#wine <- read.csv(file = "winequality_red.csv", head = TRUE, sep=";")
wine$quality <- ifelse(wine$quality>= 6, 1,0)
#Random sampling
set.seed(7406)
test <- 0.8
ff <- floor((1 - test)*nrow(wine))
ind <- sample(nrow(wine), ff, replace = FALSE)
# Training and Test datasets
wine_train <- wine[ind,]
wine_test <- wine[-ind,]
gbm.train <- gbm(quality ~ .,
data=wine_train,
distribution = 'bernoulli',
n.trees = 500,
shrinkage = 0.01,
interaction.depth = 3,
cv.folds = 5,
n.cores = NULL,verbose = FALSE)
## Model Inspection
## Find the estimated optimal number of iterations
perf_gbm1 = gbm.perf(gbm.train, method="cv")
perf_gbm1
summary(gbm.train)
| var | rel.inf | |
|---|---|---|
| alcohol | alcohol | 26.830706 |
| sulphates | sulphates | 17.951417 |
| volatile.acidity | volatile.acidity | 10.954048 |
| total.sulfur.dioxide | total.sulfur.dioxide | 8.021901 |
| chlorides | chlorides | 7.581519 |
| pH | pH | 6.381061 |
| density | density | 6.001086 |
| fixed.acidity | fixed.acidity | 5.389601 |
| residual.sugar | residual.sugar | 4.237370 |
| citric.acid | citric.acid | 4.205721 |
| free.sulfur.dioxide | free.sulfur.dioxide | 2.445569 |
## Training error
pred1gbm <- predict(gbm.train,newdata = wine_train, n.trees=perf_gbm1, type="response")
y1hat <- ifelse(pred1gbm < 0.5, 0, 1)
bst_train_err <- round(mean(y1hat != wine_train$quality),4)
print(paste('Boost Training Error = ', bst_train_err))
## Testing Error
y2hat <- ifelse(predict(gbm.train,newdata = wine_test, n.trees=perf_gbm1, type="response") < 0.5, 0, 1)
bst_test_err <- round(mean(y2hat != wine_test$quality),4)
print(paste('Boost Testing Error = ', bst_test_err))
[1] "Boost Training Error = 0.2006" [1] "Boost Testing Error = 0.2586"
Only SIGNIFICANT variables
gbm.sig <- gbm(quality ~ . - residual.sugar - fixed.acidity - density - citric.acid - pH,
data=wine_train,
distribution = 'bernoulli',
n.trees = 500,
shrinkage = 0.01,
interaction.depth = 3,
cv.folds = 5,
n.cores = NULL,verbose = FALSE)
## Model Inspection
## Find the estimated optimal number of iterations
perf_gbm2 = gbm.perf(gbm.sig, method="cv")
perf_gbm2
## Training error
pred11gbm <- predict(gbm.sig,newdata = wine_train, n.trees=perf_gbm2, type="response")
y11hat <- ifelse(pred11gbm < 0.5, 0, 1)
bst1_train_err <- round(mean(y11hat != wine_train$quality),4)
print(paste('Boost Training Error = ', bst1_train_err))
## Testing Error
y22hat <- ifelse(predict(gbm.sig,newdata = wine_test, n.trees=perf_gbm2, type="response") < 0.5, 0, 1)
bst2_test_err <- round(mean(y22hat != wine_test$quality),4)
print(paste('Boost Testing Error = ', bst2_test_err))
[1] "Boost Training Error = 0.2163" [1] "Boost Testing Error = 0.2695"
dim(wine_test_temp)[1]/dim(wine_train_temp)[1]